In [1]:
# pandas
import pandas as pd
from pandas import Series,DataFrame
# numpy, matplotlib, seaborn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb
In [2]:
# Load the three Rossmann CSV files as DataFrames:
#   train.csv - historical daily sales per store (training data)
#   store.csv - static metadata about each store
#   test.csv  - rows whose sales must be predicted
rossmann_df = pd.read_csv("train.csv")
store_df = pd.read_csv("store.csv")
test_df = pd.read_csv("test.csv")
# preview the first rows of the training data
rossmann_df.head()
Out[2]:
In [4]:
# Open
# Distribution of the Open flag, split by day of week.
fig, (axis1) = plt.subplots(1,1,figsize=(15,4))
sns.countplot(x='Open',hue='DayOfWeek', data=rossmann_df,palette="husl", ax=axis1)

# Fill missing Open values in the test set: assume every store is open
# except on DayOfWeek == 7 (Sunday).
# BUG FIX: the original used chained indexing
# (test_df["Open"][mask] = ...), which raises SettingWithCopyWarning and
# may silently fail to write back; frame-level .loc is the safe form.
missing_open = test_df["Open"].isnull()
test_df.loc[missing_open, "Open"] = (test_df.loc[missing_open, "DayOfWeek"] != 7).astype(int)

# Drop Open column
# rossmann_df.drop("Open", axis=1, inplace=True)
# test_df.drop("Open", axis=1, inplace=True)
In [5]:
# Date
# Extract integer Year and Month from the 'YYYY-MM-DD' Date strings.
# Vectorized .str slicing replaces the per-row apply(lambda ...) calls
# (same result, much faster; astype(str) keeps the defensive str() cast).
rossmann_df['Year'] = rossmann_df['Date'].astype(str).str[:4].astype(int)
rossmann_df['Month'] = rossmann_df['Date'].astype(str).str[5:7].astype(int)
test_df['Year'] = test_df['Date'].astype(str).str[:4].astype(int)
test_df['Month'] = test_df['Date'].astype(str).str[5:7].astype(int)

# Truncate Date to 'YYYY-MM' so subsequent groupbys aggregate per month
# instead of per day.
rossmann_df['Date'] = rossmann_df['Date'].astype(str).str[:7]
test_df['Date'] = test_df['Date'].astype(str).str[:7]

# Monthly average sales, and month-over-month percent change of total sales.
average_sales = rossmann_df.groupby('Date')["Sales"].mean()
pct_change_sales = rossmann_df.groupby('Date')["Sales"].sum().pct_change()

fig, (axis1,axis2) = plt.subplots(2,1,sharex=True,figsize=(15,8))

# plot average sales over time (year-month)
ax1 = average_sales.plot(legend=True,ax=axis1,marker='o',title="Average Sales")
ax1.set_xticks(range(len(average_sales)))
ax1.set_xticklabels(average_sales.index.tolist(), rotation=90)

# plot percent change of total sales over time (year-month)
ax2 = pct_change_sales.plot(legend=True,ax=axis2,marker='o',rot=90,colormap="summer",title="Sales Percent Change")
# ax2.set_xticks(range(len(pct_change_sales)))
# ax2.set_xticklabels(pct_change_sales.index.tolist(), rotation=90)
In [6]:
# Compare sales and customer traffic across years.
fig, (axis1, axis2) = plt.subplots(1, 2, figsize=(15, 4))
for metric, ax in (('Sales', axis1), ('Customers', axis2)):
    sns.barplot(x='Year', y=metric, data=rossmann_df, ax=ax)

# Drop Date column
# rossmann_df.drop(['Date'], axis=1,inplace=True)
# test_df.drop(['Date'], axis=1,inplace=True)
Out[6]:
In [7]:
# Customers
fig, (axis1,axis2) = plt.subplots(2,1,figsize=(15,8))
# Boxplot of daily customer counts: min/max and quartiles
# (whis=np.inf extends the whiskers to the extremes, so no outlier fliers).
sns.boxplot([rossmann_df["Customers"]], whis=np.inf, ax=axis1)
# group by (year-month) Date and get average customers, and percent change
average_customers = rossmann_df.groupby('Date')["Customers"].mean()
# pct_change_customers = rossmann_df.groupby('Date')["Customers"].sum().pct_change()
# Plot average customers over time;
# it should be correlated with the average sales over time
ax = average_customers.plot(legend=True,marker='o', ax=axis2)
ax.set_xticks(range(len(average_customers)))
xlabels = ax.set_xticklabels(average_customers.index.tolist(), rotation=90)
In [8]:
# DayOfWeek
# Average sales and customers per weekday (closed and open days combined).
fig, (axis1, axis2) = plt.subplots(1, 2, figsize=(15, 4))
weekday_order = [1, 2, 3, 4, 5, 6, 7]
sns.barplot(x='DayOfWeek', y='Sales', data=rossmann_df, order=weekday_order, ax=axis1)
sns.barplot(x='DayOfWeek', y='Customers', data=rossmann_df, order=weekday_order, ax=axis2)
Out[8]:
In [9]:
# Promo
# Average sales and customer counts with vs. without a running promotion.
fig, (axis1, axis2) = plt.subplots(1, 2, figsize=(15, 4))
for metric, ax in (('Sales', axis1), ('Customers', axis2)):
    sns.barplot(x='Promo', y=metric, data=rossmann_df, ax=ax)
# Promotions have a clearly visible effect on both sales and traffic.
Out[9]:
In [10]:
# StateHoliday
# The column mixes the integer 0 and the string "0"; normalize the
# integer form to "0" so the column has a single representation.
# BUG FIX: the original used chained assignment
# (rossmann_df["StateHoliday"].loc[mask] = "0"), which raises
# SettingWithCopyWarning and may not write back; use frame-level .loc.
rossmann_df.loc[rossmann_df["StateHoliday"] == 0, "StateHoliday"] = "0"
# test_df.loc[test_df["StateHoliday"] == 0, "StateHoliday"] = "0"

# Plot value counts of the (normalized) holiday codes
sns.countplot(x='StateHoliday', data=rossmann_df)

# Left: all rows. Right: only holiday rows with non-zero sales, i.e.
# stores that actually opened on a holiday.
fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4))
sns.barplot(x='StateHoliday', y='Sales', data=rossmann_df, ax=axis1)
mask = (rossmann_df["StateHoliday"] != "0") & (rossmann_df["Sales"] > 0)
sns.barplot(x='StateHoliday', y='Sales', data=rossmann_df[mask], ax=axis2)
Out[10]:
In [11]:
# .... continue with StateHoliday
# Collapse the holiday codes into a binary flag:
# 0 = no holiday, 1 = any state-holiday code ("a"/"b"/"c").
holiday_flag = {0: 0, "0": 0, "a": 1, "b": 1, "c": 1}
rossmann_df["StateHoliday"] = rossmann_df["StateHoliday"].map(holiday_flag)
test_df["StateHoliday"] = test_df["StateHoliday"].map(holiday_flag)

fig, (axis1, axis2) = plt.subplots(1, 2, figsize=(15, 4))
sns.barplot(x='StateHoliday', y='Sales', data=rossmann_df, ax=axis1)
sns.barplot(x='StateHoliday', y='Customers', data=rossmann_df, ax=axis2)
Out[11]:
In [12]:
# SchoolHoliday
# Row counts per SchoolHoliday flag, then its effect on average sales
# and customer traffic.
sns.countplot(x='SchoolHoliday', data=rossmann_df)
fig, (axis1, axis2) = plt.subplots(1, 2, figsize=(15, 4))
for metric, ax in (('Sales', axis1), ('Customers', axis2)):
    sns.barplot(x='SchoolHoliday', y=metric, data=rossmann_df, ax=ax)
Out[12]:
In [13]:
# Sales
fig, (axis1,axis2) = plt.subplots(2,1,figsize=(15,8))
# Boxplot of Sales: min/max and quartiles.
# BUG FIX: this cell analyzes Sales, but the original boxplot was drawn
# from the Customers column (copy-paste from the Customers cell).
sns.boxplot([rossmann_df["Sales"]], whis=np.inf, ax=axis1)
# Histogram of sales values.
# Notice that the spike at 0 is mostly days on which the store was closed.
rossmann_df["Sales"].plot(kind='hist',bins=70,xlim=(0,15000),ax=axis2)
Out[13]:
In [14]:
# Per-store averages of daily sales and customer counts over the whole period.
average_sales_customers = rossmann_df.groupby('Store')[["Sales", "Customers"]].agg("mean")
average_sales_customers
Out[14]:
In [15]:
# Using store_df
# Merge each store's average sales / customers into the store metadata.
average_sales_customers = rossmann_df.groupby('Store')[["Sales", "Customers"]].mean()
# reset_index() turns the Store index into a column, producing exactly the
# ['Store', 'Sales', 'Customers'] frame the original built by hand with
# DataFrame({...}, columns=[...]) — simpler and less error-prone.
sales_customers_df = average_sales_customers.reset_index()
store_df = pd.merge(sales_customers_df, store_df, on='Store')
store_df.head()
Out[15]:
In [16]:
# StoreType
# Frequency of each store type, then average sales / customers per type.
store_type_order = ['a', 'b', 'c', 'd']
sns.countplot(x='StoreType', data=store_df, order=store_type_order)
fig, (axis1, axis2) = plt.subplots(1, 2, figsize=(15, 4))
sns.barplot(x='StoreType', y='Sales', data=store_df, order=store_type_order, ax=axis1)
sns.barplot(x='StoreType', y='Customers', data=store_df, order=store_type_order, ax=axis2)
Out[16]:
In [17]:
# Assortment
# Frequency of each assortment level, then average sales / customers per level.
assortment_order = ['a', 'b', 'c']
sns.countplot(x='Assortment', data=store_df, order=assortment_order)
fig, (axis1, axis2) = plt.subplots(1, 2, figsize=(15, 4))
sns.barplot(x='Assortment', y='Sales', data=store_df, order=assortment_order, ax=axis1)
sns.barplot(x='Assortment', y='Customers', data=store_df, order=assortment_order, ax=axis2)
Out[17]:
In [18]:
# Promo2
# Participation in the Promo2 program vs. average sales and customers.
sns.countplot(x='Promo2', data=store_df)
fig, (axis1, axis2) = plt.subplots(1, 2, figsize=(15, 4))
for metric, ax in (('Sales', axis1), ('Customers', axis2)):
    sns.barplot(x='Promo2', y=metric, data=store_df, ax=ax)
Out[18]:
In [19]:
# CompetitionDistance
# Fill NaN distances with the median.
# BUG FIX: fillna() returns a new Series; the original discarded the
# result, leaving the NaNs in place. Assign it back to the column.
store_df["CompetitionDistance"] = store_df["CompetitionDistance"].fillna(
    store_df["CompetitionDistance"].median())
# Plot CompetitionDistance Vs Sales
store_df.plot(kind='scatter',x='CompetitionDistance',y='Sales',figsize=(15,4))
# NOTE(review): for kind='kde' pandas appears to plot the distribution of
# the selected columns only — verify the x argument has any effect here.
store_df.plot(kind='kde',x='CompetitionDistance',y='Sales',figsize=(15,4))
Out[19]:
In [20]:
# What happened to the average sales of a store over time when competition started?
# Example: the average sales for store_id = 6 dropped sharply once its
# competitor opened.
store_id = 6
store_data = rossmann_df[rossmann_df["Store"] == store_id]
average_store_sales = store_data.groupby('Date')["Sales"].mean()

# Year and month in which this store's competition opened (NaN if unknown).
y = store_df["CompetitionOpenSinceYear"].loc[store_df["Store"] == store_id].values[0]
m = store_df["CompetitionOpenSinceMonth"].loc[store_df["Store"] == store_id].values[0]

# Plot monthly average sales for this store
ax = average_store_sales.plot(legend=True,figsize=(15,4),marker='o')
ax.set_xticks(range(len(average_store_sales)))
ax.set_xticklabels(average_store_sales.index.tolist(), rotation=90)

# The sales data only covers 2013-2015, so the competition marker is only
# meaningful when the opening year falls in that window and both year and
# month are known. pd.notna() replaces the cryptic NaN self-comparisons
# (y == y, m == m) used before — identical truth value, readable intent.
if pd.notna(y) and pd.notna(m) and y >= 2013:
    plt.axvline(x=((y-2013) * 12) + (m - 1), linewidth=3, color='grey')
In [21]:
# Risk Analysis
# Analyze the risk of a store: Risk (std of sales) vs Expected (mean of sales)
# .... continue using store_data (the single store selected above)
store_average = store_data["Sales"].mean()
store_std = store_data["Sales"].std()
# Plot this store as one point in (expected, risk) space
plt.scatter(store_average, store_std,alpha = 0.5,s =np.pi*20)
# Get min & max mean and std of store sales across all stores,
# used below to scale the axes so the point appears in context.
# Remember that store_df["Sales"] has the average sales for a store
std_sales = rossmann_df.groupby('Store')["Sales"].std()
min_average = store_df["Sales"].min()
max_average = store_df["Sales"].max()
min_std = std_sales.min()
max_std = std_sales.max()
# Set the x and y limits of the plot
plt.ylim([min_std, max_std])
plt.xlim([min_average, max_average])
# Set the plot axis titles
plt.xlabel('Expected Sales')
plt.ylabel('Risk')
# Annotate the plotted point with its store id
label, x, y = "Store {}".format(store_id), store_average, store_std
plt.annotate(
label,
xy = (x, y), xytext = (50, 50),
textcoords = 'offset points', ha = 'right', va = 'bottom',
arrowprops = dict(arrowstyle = '-', connectionstyle = 'arc3,rad=-0.3'))
Out[21]:
In [22]:
# Correlation
# Pivot to one column per store (monthly total sales) as the basis for
# store-to-store correlation analysis.
store_piv = rossmann_df.pivot_table(values='Sales', index='Date', columns=['Store'], aggfunc='sum')
# Month-over-month percent change per store (first row is NaN, dropped).
store_pct_chage = store_piv.pct_change().dropna()
store_piv.head()
Out[22]:
In [23]:
# .... continue Correlation
# Heatmap of pairwise sales correlations for a small range of stores.
start_store, end_store = 1, 5
store_range = list(range(start_store, end_store + 1))
fig, (axis1) = plt.subplots(1, 1, figsize=(15, 5))
# using summation of sales values for each store
sns.heatmap(store_piv[store_range].corr(), annot=True, linewidths=2)
# using percent change for each store
# sns.heatmap(store_pct_chage[store_range].corr(), annot=True, linewidths=2)
Out[23]:
In [24]:
# The test set only covers year 2015, months 8 & 9, so Year/Month carry
# no useful signal; drop them from both frames.
rossmann_df.drop(["Year", "Month"], axis=1, inplace=True)
test_df.drop(["Year", "Month"], axis=1, inplace=True)

# One-hot encode DayOfWeek, dropping Day_7 as the reference category,
# then replace the original column with the dummy columns.
day_dummies_rossmann = pd.get_dummies(rossmann_df['DayOfWeek'], prefix='Day').drop(['Day_7'], axis=1)
day_dummies_test = pd.get_dummies(test_df['DayOfWeek'], prefix='Day').drop(['Day_7'], axis=1)

rossmann_df = rossmann_df.join(day_dummies_rossmann).drop(['DayOfWeek'], axis=1)
test_df = test_df.join(day_dummies_test).drop(['DayOfWeek'], axis=1)
In [25]:
# Keep only (store, date) rows where the store was actually open.
rossmann_df = rossmann_df[rossmann_df["Open"] != 0]
# Open, Customers and Date are unavailable or useless at prediction time.
rossmann_df = rossmann_df.drop(["Open", "Customers", "Date"], axis=1)
In [26]:
# Remember which test rows belong to closed stores; their sales are
# assigned 0 when the submission is assembled later.
closed_store_ids = test_df.loc[test_df["Open"] == 0, "Id"].values
# Keep only rows where the store was open.
test_df = test_df[test_df["Open"] != 0]
# Drop columns that are not used for prediction.
test_df = test_df.drop(['Open', 'Date'], axis=1)
In [28]:
# Per-store modeling: train one model on each store's own history and
# predict that store's test-set sales.
rossmann_dic = dict(list(rossmann_df.groupby('Store')))
test_dic = dict(list(test_df.groupby('Store')))

# FIX: Series.append() was removed in pandas 2.0 and was O(n^2) when
# called in a loop; collect per-store Series and concatenate once.
predictions = []
scores = []

for i in test_dic:
    # current store's training data
    store = rossmann_dic[i]

    # define training and testing sets
    X_train = store.drop(["Sales","Store"], axis=1)
    Y_train = store["Sales"]

    X_test = test_dic[i].copy()
    store_ids = X_test["Id"]
    X_test.drop(["Id","Store"], axis=1, inplace=True)

    # Linear Regression — kept only for its in-sample R^2 score; its
    # predictions were always overwritten by xgboost's below, so the
    # dead lreg.predict(X_test) call has been removed.
    lreg = LinearRegression()
    lreg.fit(X_train, Y_train)
    scores.append(lreg.score(X_train, Y_train))

    # Xgboost. FIX: "reg:linear" was renamed to "reg:squarederror"
    # (same squared-error objective; the old alias is deprecated and
    # rejected by recent xgboost releases).
    params = {"objective": "reg:squarederror", "max_depth": 10}
    T_train_xgb = xgb.DMatrix(X_train, Y_train)
    X_test_xgb = xgb.DMatrix(X_test)
    gbm = xgb.train(params, T_train_xgb, 100)
    Y_pred = gbm.predict(X_test_xgb)

    # collect predicted values of the current store, indexed by row Id
    predictions.append(Series(Y_pred, index=store_ids))

# append rows (store, date) that were closed, with sales fixed at 0
predictions.append(Series(0, index=closed_store_ids))
submission = pd.concat(predictions)

# save to csv file
submission = pd.DataFrame({ "Id": submission.index, "Sales": submission.values})
submission.to_csv('rossmann.csv', index=False)
In [ ]: